#ifndef CUFFTDX_FFT_1000_FP32_INV_PTX_HPP
#define CUFFTDX_FFT_1000_FP32_INV_PTX_HPP



// Specialization 395 of cufftdx_private_function for float data.
//
// The whole body is a single `asm volatile` PTX block that transforms the ten
// complex values held in rmem[0..9] in place: every rmem[k].x / rmem[k].y is
// passed in as an "f" input operand and written back through an "=f" output.
// NOTE(review): the include guard (FFT_1000_FP32_INV) suggests this is one
// implementation of a 1000-point single-precision inverse FFT -- confirm
// against whatever selects function id 395.
//
// Parameters:
//   rmem - per-thread array; elements 0..9 are read and then overwritten.
//   smem - 32-bit shared-memory byte address ("r" operand) used as the base
//          for all st.shared / ld.shared accesses.
//
// External symbols (not declared in this block): lut_sp_10_1000 and
// lut_sp_10_100 are passed as 64-bit addresses and read with ld.global.v2.f32.
//
// Operand map of the asm statement:
//   %0 ..%19  outputs: rmem[0].x, rmem[0].y, ..., rmem[9].x, rmem[9].y
//   %20       smem
//   %21, %22  lut_sp_10_1000, lut_sp_10_100
//   %23..%48  inputs rmem[0..9]. Several .y values appear twice in the operand
//             list; the PTX only references one copy of each, so %26, %29,
//             %34, %37, %42 and %45 are never used.
//
// Thread indexing computed inside the PTX:
//   q    = %tid.x / 100 and lane = %tid.x % 100 (division done by multiply
//          with 1374389535 and a 37-bit shift)
//   base = smem + %tid.y * 8000 + q * 8000            (byte address)
//   Every access below stays inside [base, base + 8000), i.e. 1000 float2.
//   NOTE(review): %tid.y and q are both scaled by 8000 bytes, so two threads
//   with different (%tid.y, q) pairs can address the same region; presumably
//   only one of the two is ever non-zero -- verify against the launch
//   configuration.
//
// Numeric constants used by the butterflies:
//   0f3E9E377A ~= 0.309017 (cos 72deg)   0f3F4F1BBD ~= 0.809017 (cos 36deg)
//   0f3F737871 ~= 0.951057 (sin 72deg)   0f3F167918 ~= 0.587785 (sin 36deg)
//
// Structure of the PTX:
//   1. 10-point butterfly on the register inputs: a 5-point butterfly over
//      elements 0,2,4,6,8, another over elements 1,3,5,7,9, then the second
//      set is rotated by the constants above and added to / subtracted from
//      the first set.
//   2. One float2 w is loaded from lut_sp_10_1000[lane]. Its powers w^2..w^9
//      are built by repeated complex multiplication, and butterfly output k
//      (k = 1..9) is multiplied by conj(w^k):
//      re = wr*xr + wi*xi, im = wr*xi - wi*xr.
//   3. barrier.sync 0; each thread stores its 10 results as consecutive float2
//      at base + lane*80 + 8*k; barrier.sync 0; each thread reloads 10 float2
//      from base + lane*8 + 800*k.
//   4. Second 10-point butterfly, followed by the same conjugate-power
//      multiplication using w = lut_sp_10_100[lane / 10] (division by 10 done
//      by multiply with 0xCCCCCCCD and a 35-bit shift).
//   5. barrier.sync 0; results are stored at
//      base + (lane/10)*800 + (lane%10)*8 + 80*k; barrier.sync 0; reload from
//      base + lane*8 + 800*k.
//   6. Third 10-point butterfly with no extra multiplication; the sums go to
//      rmem[0..4] and the matching differences to rmem[5..9].
//
// The PTX executes `barrier.sync 0` four times, so every thread that
// participates in barrier 0 of the CTA has to call this function together.
// NOTE(review): the asm statement declares no "memory" clobber although it
// reads and writes shared memory; confirm callers do not depend on the
// compiler ordering their own shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<395, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// PTX-local registers: f0..f670 (.f32), r0..r16 (.b32), rd0..rd11 (.b64).
asm volatile (R"({
.reg .f32 f<671>;
.reg .b32 r<17>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
mov.u32 r2, %20;
mad.lo.s32 r3, r1, 8000, r2;
mov.u32 r4, %tid.x;
add.f32 f41, %28, %44;
add.f32 f42, %23, f41;
add.f32 f43, %33, %39;
add.f32 f44, f43, f42;
add.f32 f45, %30, %46;
add.f32 f46, %24, f45;
add.f32 f47, %35, %40;
add.f32 f48, f47, f46;
fma.rn.f32 f49, f41, 0f3E9E377A, %23;
mul.f32 f50, f43, 0f3F4F1BBD;
sub.f32 f51, f49, f50;
sub.f32 f52, %30, %46;
mul.f32 f53, f52, 0f3F737871;
sub.f32 f54, %35, %40;
fma.rn.f32 f55, f54, 0f3F167918, f53;
sub.f32 f56, f51, f55;
add.f32 f57, f55, f51;
mul.f32 f58, f41, 0f3F4F1BBD;
sub.f32 f59, %23, f58;
fma.rn.f32 f60, f43, 0f3E9E377A, f59;
mul.f32 f61, f52, 0f3F167918;
mul.f32 f62, f54, 0f3F737871;
sub.f32 f63, f61, f62;
sub.f32 f64, f60, f63;
add.f32 f65, f63, f60;
fma.rn.f32 f66, f45, 0f3E9E377A, %24;
mul.f32 f67, f47, 0f3F4F1BBD;
sub.f32 f68, f66, f67;
sub.f32 f69, %28, %44;
mul.f32 f70, f69, 0f3F737871;
sub.f32 f71, %33, %39;
fma.rn.f32 f72, f71, 0f3F167918, f70;
add.f32 f73, f72, f68;
sub.f32 f74, f68, f72;
mul.f32 f75, f45, 0f3F4F1BBD;
sub.f32 f76, %24, f75;
fma.rn.f32 f77, f47, 0f3E9E377A, f76;
mul.f32 f78, f69, 0f3F167918;
mul.f32 f79, f71, 0f3F737871;
sub.f32 f80, f78, f79;
add.f32 f81, f80, f77;
sub.f32 f82, f77, f80;
add.f32 f83, %31, %47;
add.f32 f84, %25, f83;
add.f32 f85, %36, %41;
add.f32 f86, f85, f84;
add.f32 f87, %32, %48;
add.f32 f88, %27, f87;
add.f32 f89, %38, %43;
add.f32 f90, f89, f88;
fma.rn.f32 f91, f83, 0f3E9E377A, %25;
mul.f32 f92, f85, 0f3F4F1BBD;
sub.f32 f93, f91, f92;
sub.f32 f94, %32, %48;
mul.f32 f95, f94, 0f3F737871;
sub.f32 f96, %38, %43;
fma.rn.f32 f97, f96, 0f3F167918, f95;
sub.f32 f98, f93, f97;
add.f32 f99, f97, f93;
mul.f32 f100, f83, 0f3F4F1BBD;
sub.f32 f101, %25, f100;
fma.rn.f32 f102, f85, 0f3E9E377A, f101;
mul.f32 f103, f94, 0f3F167918;
mul.f32 f104, f96, 0f3F737871;
sub.f32 f105, f103, f104;
sub.f32 f106, f102, f105;
add.f32 f107, f105, f102;
fma.rn.f32 f108, f87, 0f3E9E377A, %27;
mul.f32 f109, f89, 0f3F4F1BBD;
sub.f32 f110, f108, f109;
sub.f32 f111, %31, %47;
mul.f32 f112, f111, 0f3F737871;
sub.f32 f113, %36, %41;
fma.rn.f32 f114, f113, 0f3F167918, f112;
add.f32 f115, f114, f110;
sub.f32 f116, f110, f114;
mul.f32 f117, f87, 0f3F4F1BBD;
sub.f32 f118, %27, f117;
fma.rn.f32 f119, f89, 0f3E9E377A, f118;
mul.f32 f120, f111, 0f3F167918;
mul.f32 f121, f113, 0f3F737871;
sub.f32 f122, f120, f121;
add.f32 f123, f122, f119;
sub.f32 f124, f119, f122;
mul.f32 f125, f98, 0f3F4F1BBD;
mul.f32 f126, f115, 0f3F167918;
sub.f32 f127, f125, f126;
mul.f32 f128, f115, 0f3F4F1BBD;
fma.rn.f32 f129, f98, 0f3F167918, f128;
mul.f32 f130, f106, 0f3E9E377A;
mul.f32 f131, f123, 0f3F737871;
sub.f32 f132, f130, f131;
mul.f32 f133, f123, 0f3E9E377A;
fma.rn.f32 f134, f106, 0f3F737871, f133;
mul.f32 f135, f107, 0fBE9E377A;
mul.f32 f136, f124, 0f3F737871;
sub.f32 f137, f135, f136;
mul.f32 f138, f124, 0fBE9E377A;
fma.rn.f32 f139, f107, 0f3F737871, f138;
mul.f32 f140, f99, 0fBF4F1BBD;
mul.f32 f141, f116, 0f3F167918;
sub.f32 f142, f140, f141;
mul.f32 f143, f116, 0fBF4F1BBD;
fma.rn.f32 f144, f99, 0f3F167918, f143;
sub.f32 f145, f44, f86;
sub.f32 f146, f48, f90;
add.f32 f147, f56, f127;
add.f32 f148, f73, f129;
sub.f32 f149, f56, f127;
sub.f32 f150, f73, f129;
add.f32 f151, f64, f132;
add.f32 f152, f81, f134;
sub.f32 f153, f64, f132;
sub.f32 f154, f81, f134;
add.f32 f155, f65, f137;
add.f32 f156, f82, f139;
sub.f32 f157, f65, f137;
sub.f32 f158, f82, f139;
add.f32 f159, f57, f142;
add.f32 f160, f74, f144;
sub.f32 f161, f57, f142;
sub.f32 f162, f74, f144;
mul.wide.u32 rd2, r4, 1374389535;
shr.u64 rd3, rd2, 37;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 100;
sub.s32 r7, r4, r6;
mad.lo.s32 r8, r5, 8000, r3;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %21;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f163, f164}, [rd6];
mul.f32 f167, f148, f164;
mul.f32 f168, f147, f164;
mul.f32 f169, f163, f148;
mul.f32 f170, f163, f163;
mul.f32 f171, f164, f164;
sub.f32 f172, f170, f171;
mul.f32 f173, f164, f163;
fma.rn.f32 f174, f164, f163, f173;
mul.f32 f175, f152, f174;
mul.f32 f176, f151, f174;
mul.f32 f177, f172, f152;
mul.f32 f178, f163, f172;
mul.f32 f179, f164, f174;
sub.f32 f180, f178, f179;
mul.f32 f181, f163, f174;
fma.rn.f32 f182, f164, f172, f181;
mul.f32 f183, f156, f182;
mul.f32 f184, f155, f182;
mul.f32 f185, f180, f156;
mul.f32 f186, f163, f180;
mul.f32 f187, f164, f182;
sub.f32 f188, f186, f187;
mul.f32 f189, f163, f182;
fma.rn.f32 f190, f164, f180, f189;
mul.f32 f191, f160, f190;
mul.f32 f192, f159, f190;
mul.f32 f193, f188, f160;
mul.f32 f194, f163, f188;
mul.f32 f195, f164, f190;
sub.f32 f196, f194, f195;
mul.f32 f197, f163, f190;
fma.rn.f32 f198, f164, f188, f197;
mul.f32 f199, f146, f198;
mul.f32 f200, f145, f198;
mul.f32 f201, f196, f146;
mul.f32 f202, f163, f196;
mul.f32 f203, f164, f198;
sub.f32 f204, f202, f203;
mul.f32 f205, f163, f198;
fma.rn.f32 f206, f164, f196, f205;
mul.f32 f207, f150, f206;
mul.f32 f208, f149, f206;
mul.f32 f209, f204, f150;
mul.f32 f210, f163, f204;
mul.f32 f211, f164, f206;
sub.f32 f212, f210, f211;
mul.f32 f213, f163, f206;
fma.rn.f32 f214, f164, f204, f213;
mul.f32 f215, f154, f214;
mul.f32 f216, f153, f214;
mul.f32 f217, f212, f154;
mul.f32 f218, f163, f212;
mul.f32 f219, f164, f214;
sub.f32 f220, f218, f219;
mul.f32 f221, f163, f214;
fma.rn.f32 f222, f164, f212, f221;
mul.f32 f223, f158, f222;
mul.f32 f224, f157, f222;
mul.f32 f225, f220, f158;
mul.f32 f226, f163, f220;
mul.f32 f227, f164, f222;
sub.f32 f228, f226, f227;
mul.f32 f229, f163, f222;
fma.rn.f32 f230, f164, f220, f229;
mul.f32 f231, f162, f230;
mul.f32 f232, f161, f230;
mul.f32 f233, f228, f162;
barrier.sync 0;
mad.lo.s32 r9, r7, 80, r8;
add.f32 f234, f48, f90;
add.f32 f235, f44, f86;
st.shared.v2.f32 [r9], {f235, f234};
fma.rn.f32 f236, f163, f147, f167;
sub.f32 f237, f169, f168;
st.shared.v2.f32 [r9+8], {f236, f237};
fma.rn.f32 f238, f172, f151, f175;
sub.f32 f239, f177, f176;
st.shared.v2.f32 [r9+16], {f238, f239};
fma.rn.f32 f240, f180, f155, f183;
sub.f32 f241, f185, f184;
st.shared.v2.f32 [r9+24], {f240, f241};
sub.f32 f242, f193, f192;
fma.rn.f32 f243, f188, f159, f191;
st.shared.v2.f32 [r9+32], {f243, f242};
fma.rn.f32 f244, f196, f145, f199;
sub.f32 f245, f201, f200;
st.shared.v2.f32 [r9+40], {f244, f245};
fma.rn.f32 f246, f204, f149, f207;
sub.f32 f247, f209, f208;
st.shared.v2.f32 [r9+48], {f246, f247};
fma.rn.f32 f248, f212, f153, f215;
sub.f32 f249, f217, f216;
st.shared.v2.f32 [r9+56], {f248, f249};
fma.rn.f32 f250, f220, f157, f223;
sub.f32 f251, f225, f224;
st.shared.v2.f32 [r9+64], {f250, f251};
fma.rn.f32 f252, f228, f161, f231;
sub.f32 f253, f233, f232;
st.shared.v2.f32 [r9+72], {f252, f253};
barrier.sync 0;
mad.lo.s32 r10, r7, -72, r9;
ld.shared.v2.f32 {f254, f255}, [r10];
ld.shared.v2.f32 {f258, f259}, [r10+800];
ld.shared.v2.f32 {f262, f263}, [r10+1600];
ld.shared.v2.f32 {f266, f267}, [r10+2400];
ld.shared.v2.f32 {f270, f271}, [r10+3200];
ld.shared.v2.f32 {f274, f275}, [r10+4000];
ld.shared.v2.f32 {f278, f279}, [r10+4800];
ld.shared.v2.f32 {f282, f283}, [r10+5600];
ld.shared.v2.f32 {f286, f287}, [r10+6400];
ld.shared.v2.f32 {f290, f291}, [r10+7200];
add.f32 f294, f262, f286;
add.f32 f295, f254, f294;
add.f32 f296, f270, f278;
add.f32 f297, f296, f295;
add.f32 f298, f263, f287;
add.f32 f299, f255, f298;
add.f32 f300, f271, f279;
add.f32 f301, f300, f299;
fma.rn.f32 f302, f294, 0f3E9E377A, f254;
mul.f32 f303, f296, 0f3F4F1BBD;
sub.f32 f304, f302, f303;
sub.f32 f305, f263, f287;
mul.f32 f306, f305, 0f3F737871;
sub.f32 f307, f271, f279;
fma.rn.f32 f308, f307, 0f3F167918, f306;
sub.f32 f309, f304, f308;
add.f32 f310, f308, f304;
mul.f32 f311, f294, 0f3F4F1BBD;
sub.f32 f312, f254, f311;
fma.rn.f32 f313, f296, 0f3E9E377A, f312;
mul.f32 f314, f305, 0f3F167918;
mul.f32 f315, f307, 0f3F737871;
sub.f32 f316, f314, f315;
sub.f32 f317, f313, f316;
add.f32 f318, f316, f313;
fma.rn.f32 f319, f298, 0f3E9E377A, f255;
mul.f32 f320, f300, 0f3F4F1BBD;
sub.f32 f321, f319, f320;
sub.f32 f322, f262, f286;
mul.f32 f323, f322, 0f3F737871;
sub.f32 f324, f270, f278;
fma.rn.f32 f325, f324, 0f3F167918, f323;
add.f32 f326, f325, f321;
sub.f32 f327, f321, f325;
mul.f32 f328, f298, 0f3F4F1BBD;
sub.f32 f329, f255, f328;
fma.rn.f32 f330, f300, 0f3E9E377A, f329;
mul.f32 f331, f322, 0f3F167918;
mul.f32 f332, f324, 0f3F737871;
sub.f32 f333, f331, f332;
add.f32 f334, f333, f330;
sub.f32 f335, f330, f333;
add.f32 f336, f266, f290;
add.f32 f337, f258, f336;
add.f32 f338, f274, f282;
add.f32 f339, f338, f337;
add.f32 f340, f267, f291;
add.f32 f341, f259, f340;
add.f32 f342, f275, f283;
add.f32 f343, f342, f341;
fma.rn.f32 f344, f336, 0f3E9E377A, f258;
mul.f32 f345, f338, 0f3F4F1BBD;
sub.f32 f346, f344, f345;
sub.f32 f347, f267, f291;
mul.f32 f348, f347, 0f3F737871;
sub.f32 f349, f275, f283;
fma.rn.f32 f350, f349, 0f3F167918, f348;
sub.f32 f351, f346, f350;
add.f32 f352, f350, f346;
mul.f32 f353, f336, 0f3F4F1BBD;
sub.f32 f354, f258, f353;
fma.rn.f32 f355, f338, 0f3E9E377A, f354;
mul.f32 f356, f347, 0f3F167918;
mul.f32 f357, f349, 0f3F737871;
sub.f32 f358, f356, f357;
sub.f32 f359, f355, f358;
add.f32 f360, f358, f355;
fma.rn.f32 f361, f340, 0f3E9E377A, f259;
mul.f32 f362, f342, 0f3F4F1BBD;
sub.f32 f363, f361, f362;
sub.f32 f364, f266, f290;
mul.f32 f365, f364, 0f3F737871;
sub.f32 f366, f274, f282;
fma.rn.f32 f367, f366, 0f3F167918, f365;
add.f32 f368, f367, f363;
sub.f32 f369, f363, f367;
mul.f32 f370, f340, 0f3F4F1BBD;
sub.f32 f371, f259, f370;
fma.rn.f32 f372, f342, 0f3E9E377A, f371;
mul.f32 f373, f364, 0f3F167918;
mul.f32 f374, f366, 0f3F737871;
sub.f32 f375, f373, f374;
add.f32 f376, f375, f372;
sub.f32 f377, f372, f375;
mul.f32 f378, f351, 0f3F4F1BBD;
mul.f32 f379, f368, 0f3F167918;
sub.f32 f380, f378, f379;
mul.f32 f381, f368, 0f3F4F1BBD;
fma.rn.f32 f382, f351, 0f3F167918, f381;
mul.f32 f383, f359, 0f3E9E377A;
mul.f32 f384, f376, 0f3F737871;
sub.f32 f385, f383, f384;
mul.f32 f386, f376, 0f3E9E377A;
fma.rn.f32 f387, f359, 0f3F737871, f386;
mul.f32 f388, f360, 0fBE9E377A;
mul.f32 f389, f377, 0f3F737871;
sub.f32 f390, f388, f389;
mul.f32 f391, f377, 0fBE9E377A;
fma.rn.f32 f392, f360, 0f3F737871, f391;
mul.f32 f393, f352, 0fBF4F1BBD;
mul.f32 f394, f369, 0f3F167918;
sub.f32 f395, f393, f394;
mul.f32 f396, f369, 0fBF4F1BBD;
fma.rn.f32 f397, f352, 0f3F167918, f396;
sub.f32 f398, f297, f339;
sub.f32 f399, f301, f343;
add.f32 f400, f309, f380;
add.f32 f401, f326, f382;
sub.f32 f402, f309, f380;
sub.f32 f403, f326, f382;
add.f32 f404, f317, f385;
add.f32 f405, f334, f387;
sub.f32 f406, f317, f385;
sub.f32 f407, f334, f387;
add.f32 f408, f318, f390;
add.f32 f409, f335, f392;
sub.f32 f410, f318, f390;
sub.f32 f411, f335, f392;
add.f32 f412, f310, f395;
add.f32 f413, f327, f397;
sub.f32 f414, f310, f395;
sub.f32 f415, f327, f397;
mul.wide.u32 rd7, r7, -858993459;
shr.u64 rd8, rd7, 35;
cvt.u32.u64 r11, rd8;
mul.lo.s32 r12, r11, 10;
sub.s32 r13, r7, r12;
mul.wide.u32 rd9, r11, 8;
mov.u64 rd10, %22;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f416, f417}, [rd11];
mul.f32 f420, f401, f417;
mul.f32 f421, f400, f417;
mul.f32 f422, f416, f401;
mul.f32 f423, f416, f416;
mul.f32 f424, f417, f417;
sub.f32 f425, f423, f424;
mul.f32 f426, f417, f416;
fma.rn.f32 f427, f417, f416, f426;
mul.f32 f428, f405, f427;
mul.f32 f429, f404, f427;
mul.f32 f430, f425, f405;
mul.f32 f431, f416, f425;
mul.f32 f432, f417, f427;
sub.f32 f433, f431, f432;
mul.f32 f434, f416, f427;
fma.rn.f32 f435, f417, f425, f434;
mul.f32 f436, f409, f435;
mul.f32 f437, f408, f435;
mul.f32 f438, f433, f409;
mul.f32 f439, f416, f433;
mul.f32 f440, f417, f435;
sub.f32 f441, f439, f440;
mul.f32 f442, f416, f435;
fma.rn.f32 f443, f417, f433, f442;
mul.f32 f444, f413, f443;
mul.f32 f445, f412, f443;
mul.f32 f446, f441, f413;
mul.f32 f447, f416, f441;
mul.f32 f448, f417, f443;
sub.f32 f449, f447, f448;
mul.f32 f450, f416, f443;
fma.rn.f32 f451, f417, f441, f450;
mul.f32 f452, f399, f451;
mul.f32 f453, f398, f451;
mul.f32 f454, f449, f399;
mul.f32 f455, f416, f449;
mul.f32 f456, f417, f451;
sub.f32 f457, f455, f456;
mul.f32 f458, f416, f451;
fma.rn.f32 f459, f417, f449, f458;
mul.f32 f460, f403, f459;
mul.f32 f461, f402, f459;
mul.f32 f462, f457, f403;
mul.f32 f463, f416, f457;
mul.f32 f464, f417, f459;
sub.f32 f465, f463, f464;
mul.f32 f466, f416, f459;
fma.rn.f32 f467, f417, f457, f466;
mul.f32 f468, f407, f467;
mul.f32 f469, f406, f467;
mul.f32 f470, f465, f407;
mul.f32 f471, f416, f465;
mul.f32 f472, f417, f467;
sub.f32 f473, f471, f472;
mul.f32 f474, f416, f467;
fma.rn.f32 f475, f417, f465, f474;
mul.f32 f476, f411, f475;
mul.f32 f477, f410, f475;
mul.f32 f478, f473, f411;
mul.f32 f479, f416, f473;
mul.f32 f480, f417, f475;
sub.f32 f481, f479, f480;
mul.f32 f482, f416, f475;
fma.rn.f32 f483, f417, f473, f482;
mul.f32 f484, f415, f483;
mul.f32 f485, f414, f483;
mul.f32 f486, f481, f415;
shl.b32 r14, r13, 3;
add.s32 r15, r8, r14;
barrier.sync 0;
mad.lo.s32 r16, r11, 800, r15;
add.f32 f487, f301, f343;
add.f32 f488, f297, f339;
st.shared.v2.f32 [r16], {f488, f487};
fma.rn.f32 f489, f416, f400, f420;
sub.f32 f490, f422, f421;
st.shared.v2.f32 [r16+80], {f489, f490};
fma.rn.f32 f491, f425, f404, f428;
sub.f32 f492, f430, f429;
st.shared.v2.f32 [r16+160], {f491, f492};
fma.rn.f32 f493, f433, f408, f436;
sub.f32 f494, f438, f437;
st.shared.v2.f32 [r16+240], {f493, f494};
fma.rn.f32 f495, f441, f412, f444;
sub.f32 f496, f446, f445;
st.shared.v2.f32 [r16+320], {f495, f496};
sub.f32 f497, f454, f453;
fma.rn.f32 f498, f449, f398, f452;
st.shared.v2.f32 [r16+400], {f498, f497};
sub.f32 f499, f462, f461;
fma.rn.f32 f500, f457, f402, f460;
st.shared.v2.f32 [r16+480], {f500, f499};
fma.rn.f32 f501, f465, f406, f468;
sub.f32 f502, f470, f469;
st.shared.v2.f32 [r16+560], {f501, f502};
fma.rn.f32 f503, f473, f410, f476;
sub.f32 f504, f478, f477;
st.shared.v2.f32 [r16+640], {f503, f504};
fma.rn.f32 f505, f481, f414, f484;
sub.f32 f506, f486, f485;
st.shared.v2.f32 [r16+720], {f505, f506};
barrier.sync 0;
ld.shared.v2.f32 {f507, f508}, [r10];
ld.shared.v2.f32 {f511, f512}, [r10+800];
ld.shared.v2.f32 {f515, f516}, [r10+1600];
ld.shared.v2.f32 {f519, f520}, [r10+2400];
ld.shared.v2.f32 {f523, f524}, [r10+3200];
ld.shared.v2.f32 {f527, f528}, [r10+4000];
ld.shared.v2.f32 {f531, f532}, [r10+4800];
ld.shared.v2.f32 {f535, f536}, [r10+5600];
ld.shared.v2.f32 {f539, f540}, [r10+6400];
ld.shared.v2.f32 {f543, f544}, [r10+7200];
add.f32 f547, f515, f539;
add.f32 f548, f507, f547;
add.f32 f549, f523, f531;
add.f32 f550, f549, f548;
add.f32 f551, f516, f540;
add.f32 f552, f508, f551;
add.f32 f553, f524, f532;
add.f32 f554, f553, f552;
fma.rn.f32 f555, f547, 0f3E9E377A, f507;
mul.f32 f556, f549, 0f3F4F1BBD;
sub.f32 f557, f555, f556;
sub.f32 f558, f516, f540;
mul.f32 f559, f558, 0f3F737871;
sub.f32 f560, f524, f532;
fma.rn.f32 f561, f560, 0f3F167918, f559;
sub.f32 f562, f557, f561;
add.f32 f563, f561, f557;
mul.f32 f564, f547, 0f3F4F1BBD;
sub.f32 f565, f507, f564;
fma.rn.f32 f566, f549, 0f3E9E377A, f565;
mul.f32 f567, f558, 0f3F167918;
mul.f32 f568, f560, 0f3F737871;
sub.f32 f569, f567, f568;
sub.f32 f570, f566, f569;
add.f32 f571, f569, f566;
fma.rn.f32 f572, f551, 0f3E9E377A, f508;
mul.f32 f573, f553, 0f3F4F1BBD;
sub.f32 f574, f572, f573;
sub.f32 f575, f515, f539;
mul.f32 f576, f575, 0f3F737871;
sub.f32 f577, f523, f531;
fma.rn.f32 f578, f577, 0f3F167918, f576;
add.f32 f579, f578, f574;
sub.f32 f580, f574, f578;
mul.f32 f581, f551, 0f3F4F1BBD;
sub.f32 f582, f508, f581;
fma.rn.f32 f583, f553, 0f3E9E377A, f582;
mul.f32 f584, f575, 0f3F167918;
mul.f32 f585, f577, 0f3F737871;
sub.f32 f586, f584, f585;
add.f32 f587, f586, f583;
sub.f32 f588, f583, f586;
add.f32 f589, f519, f543;
add.f32 f590, f511, f589;
add.f32 f591, f527, f535;
add.f32 f592, f591, f590;
add.f32 f593, f520, f544;
add.f32 f594, f512, f593;
add.f32 f595, f528, f536;
add.f32 f596, f595, f594;
fma.rn.f32 f597, f589, 0f3E9E377A, f511;
mul.f32 f598, f591, 0f3F4F1BBD;
sub.f32 f599, f597, f598;
sub.f32 f600, f520, f544;
mul.f32 f601, f600, 0f3F737871;
sub.f32 f602, f528, f536;
fma.rn.f32 f603, f602, 0f3F167918, f601;
sub.f32 f604, f599, f603;
add.f32 f605, f603, f599;
mul.f32 f606, f589, 0f3F4F1BBD;
sub.f32 f607, f511, f606;
fma.rn.f32 f608, f591, 0f3E9E377A, f607;
mul.f32 f609, f600, 0f3F167918;
mul.f32 f610, f602, 0f3F737871;
sub.f32 f611, f609, f610;
sub.f32 f612, f608, f611;
add.f32 f613, f611, f608;
fma.rn.f32 f614, f593, 0f3E9E377A, f512;
mul.f32 f615, f595, 0f3F4F1BBD;
sub.f32 f616, f614, f615;
sub.f32 f617, f519, f543;
mul.f32 f618, f617, 0f3F737871;
sub.f32 f619, f527, f535;
fma.rn.f32 f620, f619, 0f3F167918, f618;
add.f32 f621, f620, f616;
sub.f32 f622, f616, f620;
mul.f32 f623, f593, 0f3F4F1BBD;
sub.f32 f624, f512, f623;
fma.rn.f32 f625, f595, 0f3E9E377A, f624;
mul.f32 f626, f617, 0f3F167918;
mul.f32 f627, f619, 0f3F737871;
sub.f32 f628, f626, f627;
add.f32 f629, f628, f625;
sub.f32 f630, f625, f628;
mul.f32 f631, f604, 0f3F4F1BBD;
mul.f32 f632, f621, 0f3F167918;
sub.f32 f633, f631, f632;
mul.f32 f634, f621, 0f3F4F1BBD;
fma.rn.f32 f635, f604, 0f3F167918, f634;
mul.f32 f636, f612, 0f3E9E377A;
mul.f32 f637, f629, 0f3F737871;
sub.f32 f638, f636, f637;
mul.f32 f639, f629, 0f3E9E377A;
fma.rn.f32 f640, f612, 0f3F737871, f639;
mul.f32 f641, f613, 0fBE9E377A;
mul.f32 f642, f630, 0f3F737871;
sub.f32 f643, f641, f642;
mul.f32 f644, f630, 0fBE9E377A;
fma.rn.f32 f645, f613, 0f3F737871, f644;
mul.f32 f646, f605, 0fBF4F1BBD;
mul.f32 f647, f622, 0f3F167918;
sub.f32 f648, f646, f647;
mul.f32 f649, f622, 0fBF4F1BBD;
fma.rn.f32 f650, f605, 0f3F167918, f649;
add.f32 %1, f554, f596;
add.f32 %0, f550, f592;
add.f32 %3, f579, f635;
add.f32 %2, f562, f633;
add.f32 %5, f587, f640;
add.f32 %4, f570, f638;
add.f32 %7, f588, f645;
add.f32 %6, f571, f643;
add.f32 %9, f580, f650;
add.f32 %8, f563, f648;
sub.f32 %11, f554, f596;
sub.f32 %10, f550, f592;
sub.f32 %13, f579, f635;
sub.f32 %12, f562, f633;
sub.f32 %15, f587, f640;
sub.f32 %14, f570, f638;
sub.f32 %17, f588, f645;
sub.f32 %16, f571, f643;
sub.f32 %19, f580, f650;
sub.f32 %18, f563, f648;
})"
     // Outputs %0..%19 (rmem[0..9] .x/.y), then inputs: %20 smem, %21/%22 twiddle tables, %23..%48 rmem values.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<396, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<631>;
.reg .b32 r<17>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
mov.u32 r2, %20;
mad.lo.s32 r3, r1, 4000, r2;
mov.u32 r4, %tid.x;
add.f32 f41, %28, %44;
add.f32 f42, %23, f41;
add.f32 f43, %33, %39;
add.f32 f44, f43, f42;
add.f32 f45, %30, %46;
add.f32 f46, %24, f45;
add.f32 f47, %35, %40;
add.f32 f48, f47, f46;
fma.rn.f32 f49, f41, 0f3E9E377A, %23;
mul.f32 f50, f43, 0f3F4F1BBD;
sub.f32 f51, f49, f50;
sub.f32 f52, %30, %46;
mul.f32 f53, f52, 0f3F737871;
sub.f32 f54, %35, %40;
fma.rn.f32 f55, f54, 0f3F167918, f53;
sub.f32 f56, f51, f55;
add.f32 f57, f55, f51;
mul.f32 f58, f41, 0f3F4F1BBD;
sub.f32 f59, %23, f58;
fma.rn.f32 f60, f43, 0f3E9E377A, f59;
mul.f32 f61, f52, 0f3F167918;
mul.f32 f62, f54, 0f3F737871;
sub.f32 f63, f61, f62;
sub.f32 f64, f60, f63;
add.f32 f65, f63, f60;
fma.rn.f32 f66, f45, 0f3E9E377A, %24;
mul.f32 f67, f47, 0f3F4F1BBD;
sub.f32 f68, f66, f67;
sub.f32 f69, %28, %44;
mul.f32 f70, f69, 0f3F737871;
sub.f32 f71, %33, %39;
fma.rn.f32 f72, f71, 0f3F167918, f70;
add.f32 f73, f72, f68;
sub.f32 f74, f68, f72;
mul.f32 f75, f45, 0f3F4F1BBD;
sub.f32 f76, %24, f75;
fma.rn.f32 f77, f47, 0f3E9E377A, f76;
mul.f32 f78, f69, 0f3F167918;
mul.f32 f79, f71, 0f3F737871;
sub.f32 f80, f78, f79;
add.f32 f81, f80, f77;
sub.f32 f82, f77, f80;
add.f32 f83, %31, %47;
add.f32 f84, %25, f83;
add.f32 f85, %36, %41;
add.f32 f86, f85, f84;
add.f32 f87, %32, %48;
add.f32 f88, %27, f87;
add.f32 f89, %38, %43;
add.f32 f90, f89, f88;
fma.rn.f32 f91, f83, 0f3E9E377A, %25;
mul.f32 f92, f85, 0f3F4F1BBD;
sub.f32 f93, f91, f92;
sub.f32 f94, %32, %48;
mul.f32 f95, f94, 0f3F737871;
sub.f32 f96, %38, %43;
fma.rn.f32 f97, f96, 0f3F167918, f95;
sub.f32 f98, f93, f97;
add.f32 f99, f97, f93;
mul.f32 f100, f83, 0f3F4F1BBD;
sub.f32 f101, %25, f100;
fma.rn.f32 f102, f85, 0f3E9E377A, f101;
mul.f32 f103, f94, 0f3F167918;
mul.f32 f104, f96, 0f3F737871;
sub.f32 f105, f103, f104;
sub.f32 f106, f102, f105;
add.f32 f107, f105, f102;
fma.rn.f32 f108, f87, 0f3E9E377A, %27;
mul.f32 f109, f89, 0f3F4F1BBD;
sub.f32 f110, f108, f109;
sub.f32 f111, %31, %47;
mul.f32 f112, f111, 0f3F737871;
sub.f32 f113, %36, %41;
fma.rn.f32 f114, f113, 0f3F167918, f112;
add.f32 f115, f114, f110;
sub.f32 f116, f110, f114;
mul.f32 f117, f87, 0f3F4F1BBD;
sub.f32 f118, %27, f117;
fma.rn.f32 f119, f89, 0f3E9E377A, f118;
mul.f32 f120, f111, 0f3F167918;
mul.f32 f121, f113, 0f3F737871;
sub.f32 f122, f120, f121;
add.f32 f123, f122, f119;
sub.f32 f124, f119, f122;
mul.f32 f125, f98, 0f3F4F1BBD;
mul.f32 f126, f115, 0f3F167918;
sub.f32 f127, f125, f126;
mul.f32 f128, f115, 0f3F4F1BBD;
fma.rn.f32 f129, f98, 0f3F167918, f128;
mul.f32 f130, f106, 0f3E9E377A;
mul.f32 f131, f123, 0f3F737871;
sub.f32 f132, f130, f131;
mul.f32 f133, f123, 0f3E9E377A;
fma.rn.f32 f134, f106, 0f3F737871, f133;
mul.f32 f135, f107, 0fBE9E377A;
mul.f32 f136, f124, 0f3F737871;
sub.f32 f137, f135, f136;
mul.f32 f138, f124, 0fBE9E377A;
fma.rn.f32 f139, f107, 0f3F737871, f138;
mul.f32 f140, f99, 0fBF4F1BBD;
mul.f32 f141, f116, 0f3F167918;
sub.f32 f142, f140, f141;
mul.f32 f143, f116, 0fBF4F1BBD;
fma.rn.f32 f144, f99, 0f3F167918, f143;
add.f32 f145, f44, f86;
add.f32 f146, f48, f90;
sub.f32 f147, f44, f86;
sub.f32 f148, f48, f90;
add.f32 f149, f56, f127;
add.f32 f150, f73, f129;
sub.f32 f151, f56, f127;
sub.f32 f152, f73, f129;
add.f32 f153, f64, f132;
add.f32 f154, f81, f134;
sub.f32 f155, f64, f132;
sub.f32 f156, f81, f134;
add.f32 f157, f65, f137;
add.f32 f158, f82, f139;
sub.f32 f159, f65, f137;
sub.f32 f160, f82, f139;
add.f32 f161, f57, f142;
add.f32 f162, f74, f144;
sub.f32 f163, f57, f142;
sub.f32 f164, f74, f144;
mul.wide.u32 rd2, r4, 1374389535;
shr.u64 rd3, rd2, 37;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 100;
sub.s32 r7, r4, r6;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %21;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f165, f166}, [rd6];
mul.f32 f169, f150, f166;
fma.rn.f32 f170, f165, f149, f169;
mul.f32 f171, f149, f166;
mul.f32 f172, f165, f150;
sub.f32 f173, f172, f171;
mul.f32 f174, f165, f165;
mul.f32 f175, f166, f166;
sub.f32 f176, f174, f175;
mul.f32 f177, f166, f165;
fma.rn.f32 f178, f166, f165, f177;
mul.f32 f179, f154, f178;
fma.rn.f32 f180, f176, f153, f179;
mul.f32 f181, f153, f178;
mul.f32 f182, f176, f154;
sub.f32 f183, f182, f181;
mul.f32 f184, f165, f176;
mul.f32 f185, f166, f178;
sub.f32 f186, f184, f185;
mul.f32 f187, f165, f178;
fma.rn.f32 f188, f166, f176, f187;
mul.f32 f189, f158, f188;
fma.rn.f32 f190, f186, f157, f189;
mul.f32 f191, f157, f188;
mul.f32 f192, f186, f158;
sub.f32 f193, f192, f191;
mul.f32 f194, f165, f186;
mul.f32 f195, f166, f188;
sub.f32 f196, f194, f195;
mul.f32 f197, f165, f188;
fma.rn.f32 f198, f166, f186, f197;
mul.f32 f199, f162, f198;
fma.rn.f32 f200, f196, f161, f199;
mul.f32 f201, f161, f198;
mul.f32 f202, f196, f162;
sub.f32 f203, f202, f201;
mul.f32 f204, f165, f196;
mul.f32 f205, f166, f198;
sub.f32 f206, f204, f205;
mul.f32 f207, f165, f198;
fma.rn.f32 f208, f166, f196, f207;
mul.f32 f209, f148, f208;
fma.rn.f32 f210, f206, f147, f209;
mul.f32 f211, f147, f208;
mul.f32 f212, f206, f148;
sub.f32 f213, f212, f211;
mul.f32 f214, f165, f206;
mul.f32 f215, f166, f208;
sub.f32 f216, f214, f215;
mul.f32 f217, f165, f208;
fma.rn.f32 f218, f166, f206, f217;
mul.f32 f219, f152, f218;
fma.rn.f32 f220, f216, f151, f219;
mul.f32 f221, f151, f218;
mul.f32 f222, f216, f152;
sub.f32 f223, f222, f221;
mul.f32 f224, f165, f216;
mul.f32 f225, f166, f218;
sub.f32 f226, f224, f225;
mul.f32 f227, f165, f218;
fma.rn.f32 f228, f166, f216, f227;
mul.f32 f229, f156, f228;
fma.rn.f32 f230, f226, f155, f229;
mul.f32 f231, f155, f228;
mul.f32 f232, f226, f156;
sub.f32 f233, f232, f231;
mul.f32 f234, f165, f226;
mul.f32 f235, f166, f228;
sub.f32 f236, f234, f235;
mul.f32 f237, f165, f228;
fma.rn.f32 f238, f166, f226, f237;
mul.f32 f239, f160, f238;
fma.rn.f32 f240, f236, f159, f239;
mul.f32 f241, f159, f238;
mul.f32 f242, f236, f160;
sub.f32 f243, f242, f241;
mul.f32 f244, f165, f236;
mul.f32 f245, f166, f238;
sub.f32 f246, f244, f245;
mul.f32 f247, f165, f238;
fma.rn.f32 f248, f166, f236, f247;
mul.f32 f249, f164, f248;
fma.rn.f32 f250, f246, f163, f249;
mul.f32 f251, f163, f248;
mul.f32 f252, f246, f164;
sub.f32 f253, f252, f251;
mad.lo.s32 r8, r5, 4000, r3;
barrier.sync 0;
mad.lo.s32 r9, r7, 40, r8;
st.shared.v2.f32 [r9], {f145, f170};
st.shared.v2.f32 [r9+8], {f180, f190};
st.shared.v2.f32 [r9+16], {f200, f210};
st.shared.v2.f32 [r9+24], {f220, f230};
st.shared.v2.f32 [r9+32], {f240, f250};
barrier.sync 0;
mad.lo.s32 r10, r7, -36, r9;
ld.shared.f32 f254, [r10];
ld.shared.f32 f255, [r10+400];
ld.shared.f32 f256, [r10+800];
ld.shared.f32 f257, [r10+1200];
ld.shared.f32 f258, [r10+1600];
ld.shared.f32 f259, [r10+2000];
ld.shared.f32 f260, [r10+2400];
ld.shared.f32 f261, [r10+2800];
ld.shared.f32 f262, [r10+3200];
ld.shared.f32 f263, [r10+3600];
barrier.sync 0;
st.shared.v2.f32 [r9], {f146, f173};
st.shared.v2.f32 [r9+8], {f183, f193};
st.shared.v2.f32 [r9+16], {f203, f213};
st.shared.v2.f32 [r9+24], {f223, f233};
st.shared.v2.f32 [r9+32], {f243, f253};
barrier.sync 0;
ld.shared.f32 f264, [r10];
ld.shared.f32 f265, [r10+400];
ld.shared.f32 f266, [r10+800];
ld.shared.f32 f267, [r10+1200];
ld.shared.f32 f268, [r10+1600];
ld.shared.f32 f269, [r10+2000];
ld.shared.f32 f270, [r10+2400];
ld.shared.f32 f271, [r10+2800];
ld.shared.f32 f272, [r10+3200];
ld.shared.f32 f273, [r10+3600];
add.f32 f274, f256, f262;
add.f32 f275, f254, f274;
add.f32 f276, f258, f260;
add.f32 f277, f276, f275;
add.f32 f278, f266, f272;
add.f32 f279, f264, f278;
add.f32 f280, f268, f270;
add.f32 f281, f280, f279;
fma.rn.f32 f282, f274, 0f3E9E377A, f254;
mul.f32 f283, f276, 0f3F4F1BBD;
sub.f32 f284, f282, f283;
sub.f32 f285, f266, f272;
mul.f32 f286, f285, 0f3F737871;
sub.f32 f287, f268, f270;
fma.rn.f32 f288, f287, 0f3F167918, f286;
sub.f32 f289, f284, f288;
add.f32 f290, f288, f284;
mul.f32 f291, f274, 0f3F4F1BBD;
sub.f32 f292, f254, f291;
fma.rn.f32 f293, f276, 0f3E9E377A, f292;
mul.f32 f294, f285, 0f3F167918;
mul.f32 f295, f287, 0f3F737871;
sub.f32 f296, f294, f295;
sub.f32 f297, f293, f296;
add.f32 f298, f296, f293;
fma.rn.f32 f299, f278, 0f3E9E377A, f264;
mul.f32 f300, f280, 0f3F4F1BBD;
sub.f32 f301, f299, f300;
sub.f32 f302, f256, f262;
mul.f32 f303, f302, 0f3F737871;
sub.f32 f304, f258, f260;
fma.rn.f32 f305, f304, 0f3F167918, f303;
add.f32 f306, f305, f301;
sub.f32 f307, f301, f305;
mul.f32 f308, f278, 0f3F4F1BBD;
sub.f32 f309, f264, f308;
fma.rn.f32 f310, f280, 0f3E9E377A, f309;
mul.f32 f311, f302, 0f3F167918;
mul.f32 f312, f304, 0f3F737871;
sub.f32 f313, f311, f312;
add.f32 f314, f313, f310;
sub.f32 f315, f310, f313;
add.f32 f316, f257, f263;
add.f32 f317, f255, f316;
add.f32 f318, f259, f261;
add.f32 f319, f318, f317;
add.f32 f320, f267, f273;
add.f32 f321, f265, f320;
add.f32 f322, f269, f271;
add.f32 f323, f322, f321;
fma.rn.f32 f324, f316, 0f3E9E377A, f255;
mul.f32 f325, f318, 0f3F4F1BBD;
sub.f32 f326, f324, f325;
sub.f32 f327, f267, f273;
mul.f32 f328, f327, 0f3F737871;
sub.f32 f329, f269, f271;
fma.rn.f32 f330, f329, 0f3F167918, f328;
sub.f32 f331, f326, f330;
add.f32 f332, f330, f326;
mul.f32 f333, f316, 0f3F4F1BBD;
sub.f32 f334, f255, f333;
fma.rn.f32 f335, f318, 0f3E9E377A, f334;
mul.f32 f336, f327, 0f3F167918;
mul.f32 f337, f329, 0f3F737871;
sub.f32 f338, f336, f337;
sub.f32 f339, f335, f338;
add.f32 f340, f338, f335;
fma.rn.f32 f341, f320, 0f3E9E377A, f265;
mul.f32 f342, f322, 0f3F4F1BBD;
sub.f32 f343, f341, f342;
sub.f32 f344, f257, f263;
mul.f32 f345, f344, 0f3F737871;
sub.f32 f346, f259, f261;
fma.rn.f32 f347, f346, 0f3F167918, f345;
add.f32 f348, f347, f343;
sub.f32 f349, f343, f347;
mul.f32 f350, f320, 0f3F4F1BBD;
sub.f32 f351, f265, f350;
fma.rn.f32 f352, f322, 0f3E9E377A, f351;
mul.f32 f353, f344, 0f3F167918;
mul.f32 f354, f346, 0f3F737871;
sub.f32 f355, f353, f354;
add.f32 f356, f355, f352;
sub.f32 f357, f352, f355;
mul.f32 f358, f331, 0f3F4F1BBD;
mul.f32 f359, f348, 0f3F167918;
sub.f32 f360, f358, f359;
mul.f32 f361, f348, 0f3F4F1BBD;
fma.rn.f32 f362, f331, 0f3F167918, f361;
mul.f32 f363, f339, 0f3E9E377A;
mul.f32 f364, f356, 0f3F737871;
sub.f32 f365, f363, f364;
mul.f32 f366, f356, 0f3E9E377A;
fma.rn.f32 f367, f339, 0f3F737871, f366;
mul.f32 f368, f340, 0fBE9E377A;
mul.f32 f369, f357, 0f3F737871;
sub.f32 f370, f368, f369;
mul.f32 f371, f357, 0fBE9E377A;
fma.rn.f32 f372, f340, 0f3F737871, f371;
mul.f32 f373, f332, 0fBF4F1BBD;
mul.f32 f374, f349, 0f3F167918;
sub.f32 f375, f373, f374;
mul.f32 f376, f349, 0fBF4F1BBD;
fma.rn.f32 f377, f332, 0f3F167918, f376;
add.f32 f378, f277, f319;
add.f32 f379, f281, f323;
sub.f32 f380, f277, f319;
sub.f32 f381, f281, f323;
add.f32 f382, f289, f360;
add.f32 f383, f306, f362;
sub.f32 f384, f289, f360;
sub.f32 f385, f306, f362;
add.f32 f386, f297, f365;
add.f32 f387, f314, f367;
sub.f32 f388, f297, f365;
sub.f32 f389, f314, f367;
add.f32 f390, f298, f370;
add.f32 f391, f315, f372;
sub.f32 f392, f298, f370;
sub.f32 f393, f315, f372;
add.f32 f394, f290, f375;
add.f32 f395, f307, f377;
sub.f32 f396, f290, f375;
sub.f32 f397, f307, f377;
mul.wide.u32 rd7, r7, -858993459;
shr.u64 rd8, rd7, 35;
cvt.u32.u64 r11, rd8;
mul.lo.s32 r12, r11, 10;
sub.s32 r13, r7, r12;
mul.wide.u32 rd9, r11, 8;
mov.u64 rd10, %22;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f398, f399}, [rd11];
mul.f32 f402, f383, f399;
fma.rn.f32 f403, f398, f382, f402;
mul.f32 f404, f382, f399;
mul.f32 f405, f398, f383;
sub.f32 f406, f405, f404;
mul.f32 f407, f398, f398;
mul.f32 f408, f399, f399;
sub.f32 f409, f407, f408;
mul.f32 f410, f399, f398;
fma.rn.f32 f411, f399, f398, f410;
mul.f32 f412, f387, f411;
fma.rn.f32 f413, f409, f386, f412;
mul.f32 f414, f386, f411;
mul.f32 f415, f409, f387;
sub.f32 f416, f415, f414;
mul.f32 f417, f398, f409;
mul.f32 f418, f399, f411;
sub.f32 f419, f417, f418;
mul.f32 f420, f398, f411;
fma.rn.f32 f421, f399, f409, f420;
mul.f32 f422, f391, f421;
fma.rn.f32 f423, f419, f390, f422;
mul.f32 f424, f390, f421;
mul.f32 f425, f419, f391;
sub.f32 f426, f425, f424;
mul.f32 f427, f398, f419;
mul.f32 f428, f399, f421;
sub.f32 f429, f427, f428;
mul.f32 f430, f398, f421;
fma.rn.f32 f431, f399, f419, f430;
mul.f32 f432, f395, f431;
fma.rn.f32 f433, f429, f394, f432;
mul.f32 f434, f394, f431;
mul.f32 f435, f429, f395;
sub.f32 f436, f435, f434;
mul.f32 f437, f398, f429;
mul.f32 f438, f399, f431;
sub.f32 f439, f437, f438;
mul.f32 f440, f398, f431;
fma.rn.f32 f441, f399, f429, f440;
mul.f32 f442, f381, f441;
fma.rn.f32 f443, f439, f380, f442;
mul.f32 f444, f380, f441;
mul.f32 f445, f439, f381;
sub.f32 f446, f445, f444;
mul.f32 f447, f398, f439;
mul.f32 f448, f399, f441;
sub.f32 f449, f447, f448;
mul.f32 f450, f398, f441;
fma.rn.f32 f451, f399, f439, f450;
mul.f32 f452, f385, f451;
fma.rn.f32 f453, f449, f384, f452;
mul.f32 f454, f384, f451;
mul.f32 f455, f449, f385;
sub.f32 f456, f455, f454;
mul.f32 f457, f398, f449;
mul.f32 f458, f399, f451;
sub.f32 f459, f457, f458;
mul.f32 f460, f398, f451;
fma.rn.f32 f461, f399, f449, f460;
mul.f32 f462, f389, f461;
fma.rn.f32 f463, f459, f388, f462;
mul.f32 f464, f388, f461;
mul.f32 f465, f459, f389;
sub.f32 f466, f465, f464;
mul.f32 f467, f398, f459;
mul.f32 f468, f399, f461;
sub.f32 f469, f467, f468;
mul.f32 f470, f398, f461;
fma.rn.f32 f471, f399, f459, f470;
mul.f32 f472, f393, f471;
fma.rn.f32 f473, f469, f392, f472;
mul.f32 f474, f392, f471;
mul.f32 f475, f469, f393;
sub.f32 f476, f475, f474;
mul.f32 f477, f398, f469;
mul.f32 f478, f399, f471;
sub.f32 f479, f477, f478;
mul.f32 f480, f398, f471;
fma.rn.f32 f481, f399, f469, f480;
mul.f32 f482, f397, f481;
fma.rn.f32 f483, f479, f396, f482;
mul.f32 f484, f396, f481;
mul.f32 f485, f479, f397;
sub.f32 f486, f485, f484;
shl.b32 r14, r13, 2;
add.s32 r15, r8, r14;
barrier.sync 0;
mad.lo.s32 r16, r11, 400, r15;
st.shared.f32 [r16], f378;
st.shared.f32 [r16+40], f403;
st.shared.f32 [r16+80], f413;
st.shared.f32 [r16+120], f423;
st.shared.f32 [r16+160], f433;
st.shared.f32 [r16+200], f443;
st.shared.f32 [r16+240], f453;
st.shared.f32 [r16+280], f463;
st.shared.f32 [r16+320], f473;
st.shared.f32 [r16+360], f483;
barrier.sync 0;
ld.shared.f32 f487, [r10];
ld.shared.f32 f488, [r10+400];
ld.shared.f32 f489, [r10+800];
ld.shared.f32 f490, [r10+1200];
ld.shared.f32 f491, [r10+1600];
ld.shared.f32 f492, [r10+2000];
ld.shared.f32 f493, [r10+2400];
ld.shared.f32 f494, [r10+2800];
ld.shared.f32 f495, [r10+3200];
ld.shared.f32 f496, [r10+3600];
barrier.sync 0;
st.shared.f32 [r16], f379;
st.shared.f32 [r16+40], f406;
st.shared.f32 [r16+80], f416;
st.shared.f32 [r16+120], f426;
st.shared.f32 [r16+160], f436;
st.shared.f32 [r16+200], f446;
st.shared.f32 [r16+240], f456;
st.shared.f32 [r16+280], f466;
st.shared.f32 [r16+320], f476;
st.shared.f32 [r16+360], f486;
barrier.sync 0;
ld.shared.f32 f497, [r10];
ld.shared.f32 f498, [r10+400];
ld.shared.f32 f499, [r10+800];
ld.shared.f32 f500, [r10+1200];
ld.shared.f32 f501, [r10+1600];
ld.shared.f32 f502, [r10+2000];
ld.shared.f32 f503, [r10+2400];
ld.shared.f32 f504, [r10+2800];
ld.shared.f32 f505, [r10+3200];
ld.shared.f32 f506, [r10+3600];
add.f32 f507, f489, f495;
add.f32 f508, f487, f507;
add.f32 f509, f491, f493;
add.f32 f510, f509, f508;
add.f32 f511, f499, f505;
add.f32 f512, f497, f511;
add.f32 f513, f501, f503;
add.f32 f514, f513, f512;
fma.rn.f32 f515, f507, 0f3E9E377A, f487;
mul.f32 f516, f509, 0f3F4F1BBD;
sub.f32 f517, f515, f516;
sub.f32 f518, f499, f505;
mul.f32 f519, f518, 0f3F737871;
sub.f32 f520, f501, f503;
fma.rn.f32 f521, f520, 0f3F167918, f519;
sub.f32 f522, f517, f521;
add.f32 f523, f521, f517;
mul.f32 f524, f507, 0f3F4F1BBD;
sub.f32 f525, f487, f524;
fma.rn.f32 f526, f509, 0f3E9E377A, f525;
mul.f32 f527, f518, 0f3F167918;
mul.f32 f528, f520, 0f3F737871;
sub.f32 f529, f527, f528;
sub.f32 f530, f526, f529;
add.f32 f531, f529, f526;
fma.rn.f32 f532, f511, 0f3E9E377A, f497;
mul.f32 f533, f513, 0f3F4F1BBD;
sub.f32 f534, f532, f533;
sub.f32 f535, f489, f495;
mul.f32 f536, f535, 0f3F737871;
sub.f32 f537, f491, f493;
fma.rn.f32 f538, f537, 0f3F167918, f536;
add.f32 f539, f538, f534;
sub.f32 f540, f534, f538;
mul.f32 f541, f511, 0f3F4F1BBD;
sub.f32 f542, f497, f541;
fma.rn.f32 f543, f513, 0f3E9E377A, f542;
mul.f32 f544, f535, 0f3F167918;
mul.f32 f545, f537, 0f3F737871;
sub.f32 f546, f544, f545;
add.f32 f547, f546, f543;
sub.f32 f548, f543, f546;
add.f32 f549, f490, f496;
add.f32 f550, f488, f549;
add.f32 f551, f492, f494;
add.f32 f552, f551, f550;
add.f32 f553, f500, f506;
add.f32 f554, f498, f553;
add.f32 f555, f502, f504;
add.f32 f556, f555, f554;
fma.rn.f32 f557, f549, 0f3E9E377A, f488;
mul.f32 f558, f551, 0f3F4F1BBD;
sub.f32 f559, f557, f558;
sub.f32 f560, f500, f506;
mul.f32 f561, f560, 0f3F737871;
sub.f32 f562, f502, f504;
fma.rn.f32 f563, f562, 0f3F167918, f561;
sub.f32 f564, f559, f563;
add.f32 f565, f563, f559;
mul.f32 f566, f549, 0f3F4F1BBD;
sub.f32 f567, f488, f566;
fma.rn.f32 f568, f551, 0f3E9E377A, f567;
mul.f32 f569, f560, 0f3F167918;
mul.f32 f570, f562, 0f3F737871;
sub.f32 f571, f569, f570;
sub.f32 f572, f568, f571;
add.f32 f573, f571, f568;
fma.rn.f32 f574, f553, 0f3E9E377A, f498;
mul.f32 f575, f555, 0f3F4F1BBD;
sub.f32 f576, f574, f575;
sub.f32 f577, f490, f496;
mul.f32 f578, f577, 0f3F737871;
sub.f32 f579, f492, f494;
fma.rn.f32 f580, f579, 0f3F167918, f578;
add.f32 f581, f580, f576;
sub.f32 f582, f576, f580;
mul.f32 f583, f553, 0f3F4F1BBD;
sub.f32 f584, f498, f583;
fma.rn.f32 f585, f555, 0f3E9E377A, f584;
mul.f32 f586, f577, 0f3F167918;
mul.f32 f587, f579, 0f3F737871;
sub.f32 f588, f586, f587;
add.f32 f589, f588, f585;
sub.f32 f590, f585, f588;
mul.f32 f591, f564, 0f3F4F1BBD;
mul.f32 f592, f581, 0f3F167918;
sub.f32 f593, f591, f592;
mul.f32 f594, f581, 0f3F4F1BBD;
fma.rn.f32 f595, f564, 0f3F167918, f594;
mul.f32 f596, f572, 0f3E9E377A;
mul.f32 f597, f589, 0f3F737871;
sub.f32 f598, f596, f597;
mul.f32 f599, f589, 0f3E9E377A;
fma.rn.f32 f600, f572, 0f3F737871, f599;
mul.f32 f601, f573, 0fBE9E377A;
mul.f32 f602, f590, 0f3F737871;
sub.f32 f603, f601, f602;
mul.f32 f604, f590, 0fBE9E377A;
fma.rn.f32 f605, f573, 0f3F737871, f604;
mul.f32 f606, f565, 0fBF4F1BBD;
mul.f32 f607, f582, 0f3F167918;
sub.f32 f608, f606, f607;
mul.f32 f609, f582, 0fBF4F1BBD;
fma.rn.f32 f610, f565, 0f3F167918, f609;
add.f32 %0, f510, f552;
add.f32 %1, f514, f556;
add.f32 %3, f539, f595;
add.f32 %2, f522, f593;
add.f32 %5, f547, f600;
add.f32 %4, f530, f598;
add.f32 %7, f548, f605;
add.f32 %6, f531, f603;
add.f32 %9, f540, f610;
add.f32 %8, f523, f608;
sub.f32 %10, f510, f552;
sub.f32 %11, f514, f556;
sub.f32 %13, f539, f595;
sub.f32 %12, f522, f593;
sub.f32 %15, f547, f600;
sub.f32 %14, f530, f598;
sub.f32 %17, f548, f605;
sub.f32 %16, f531, f603;
sub.f32 %19, f540, f610;
sub.f32 %18, f523, f608;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y));
};


#endif
